/*
  author Sylvain Bertrand <sylvain.bertrand@gmail.com>
  Protected by linux GNU GPLv2
  Copyright 2012-2014
*/
#include <linux/module.h>
#include <linux/pci.h>
#include <linux/firmware.h>
#include <linux/delay.h>
#include <linux/cdev.h>

#include <alga/rng_mng.h>
#include <alga/timing.h>
#include <uapi/alga/pixel_fmts.h>
#include <uapi/alga/amd/dce6/dce6.h>

#include <uapi/alga/amd/si/pkt.h>

#include "mc.h"
#include "rlc.h"
#include "ih.h"
#include "fence.h"
#include "ring.h"
#include "dmas.h"
#include "ba.h"
#include "cps.h"
#include "gpu.h"
#include "drv.h"

#include "ucode.h"

#include "regs.h"

#define PFP_FW_DWS	2144
#define ME_FW_DWS	2144
#define CE_FW_DWS	2144

MODULE_FIRMWARE("radeon/TAHITI_pfp.bin");
MODULE_FIRMWARE("radeon/TAHITI_me.bin");
MODULE_FIRMWARE("radeon/TAHITI_ce.bin");
MODULE_FIRMWARE("radeon/PITCAIRN_pfp.bin");
MODULE_FIRMWARE("radeon/PITCAIRN_me.bin");
MODULE_FIRMWARE("radeon/PITCAIRN_ce.bin");
MODULE_FIRMWARE("radeon/VERDE_pfp.bin");
MODULE_FIRMWARE("radeon/VERDE_me.bin");
MODULE_FIRMWARE("radeon/VERDE_ce.bin");
MODULE_FIRMWARE("radeon/OLAND_pfp.bin");
MODULE_FIRMWARE("radeon/OLAND_me.bin");
MODULE_FIRMWARE("radeon/OLAND_ce.bin");

/*
 * Must run code to init the state because of the way the registers are
 * managed. 0xffffffdd are place holder for a cpu computed value.
 */
/*
 * Context clear-state command stream, sent once at init time between
 * PREAMBLE_BEGIN_CLR_STATE/PREAMBLE_END_CLR_STATE (see cps_ctx_clr).
 * Values of the form 0xffffffdd (or 0xfffff1dd) are placeholders: the
 * trailing decimal digits are the array index that ctx_clr_state_init()
 * patches with a cpu-computed value before the table is written to the
 * ring (e.g. 0xffffff24 is patched as ctx_clr_state[24]).
 * Not const: the table is mutated in place by ctx_clr_state_init().
 */
static u32 ctx_clr_state[] = {
	PKT3(PKT3_SET_CTX_REG, 7),
	CTX_REG_IDX(DB_RENDER_CTL),
	/* DB_RENDER_CTL */
	0x00000060,
	/* DB_CNT_CTL */
	0x00000000,
	/* DB_DEPTH_VIEW */
	0x00000000,
	/* DB_RENDER_OVERRIDE_0: placeholder, patched at index 5 */
	0xfffffff5,
	/* DB_RENDER_OVERRIDE_1 */
	0x00000000,
	/* DB_HTILE_DATA_BASE */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 5),
	CTX_REG_IDX(DB_DEPTH_BOUNDS_MIN),
	/* DB_DEPTH_BOUNDS_MIN */
	0x00000000,
	/* DB_DEPTH_BOUNDS_MAX */
	0x00000000,
	/* DB_STENCIL_CLR */
	0x00000000,
	/* DB_DEPTH_CLR */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 4),
	CTX_REG_IDX(DB_DEPTH_INFO),
	/* DB_DEPTH_INFO */
	0x00000000,
	/* DB_Z_INFO */
	0x00000000,
	/* DB_STENCIL_INFO */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 2),
	CTX_REG_IDX(PA_SC_WND_OF),
	/* PA_SC_WND_OF */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 14),
	CTX_REG_IDX(PA_SC_CLIPRECT_RULE),
	/* PA_SC_CLIPRECT_RULE: placeholder, patched at index 24 */
	0xffffff24,
	/* PA_SC_CLIPRECT_0_TL */
	0x00000000,
	/* PA_SC_CLIPRECT_0_BR: placeholder, patched at index 26 */
	0xffffff26,
	/* PA_SC_CLIPRECT_1_TL */
	0x00000000,
	/* PA_SC_CLIPRECT_1_BR: placeholder, patched at index 28 */
	0xffffff28,
	/* PA_SC_CLIPRECT_2_TL */
	0x00000000,
	/* PA_SC_CLIPRECT_2_BR: placeholder, patched at index 30 */
	0xffffff30,
	/* PA_SC_CLIPRECT_3_TL */
	0x00000000,
	/* PA_SC_CLIPRECT_3_BR: placeholder, patched at index 32 */
	0xffffff32,
	/* PA_SC_EDGERULE: placeholder, patched at index 33 */
	0xffffff33,
	/* PA_SU_HW_SCR_OF */
	0x00000000,
	/* CB_TGT_MASK: placeholder, patched at index 35 */
	0xffffff35,
	/* CB_SHADER_MASK: placeholder, patched at index 36 */
	0xffffff36,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 35),
	CTX_REG_IDX(PA_SC_VPORT_0_SCISSOR_TL),
	/* PA_SC_VPORT_0_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_0_SCISSOR_BR: placeholder, patched at index 40 */
	0xffffff40,
	/* PA_SC_VPORT_1_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_1_SCISSOR_BR: placeholder, patched at index 42 */
	0xffffff42,
	/* PA_SC_VPORT_2_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_2_SCISSOR_BR: placeholder, patched at index 44 */
	0xffffff44,
	/* PA_SC_VPORT_3_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_3_SCISSOR_BR: placeholder, patched at index 46 */
	0xffffff46,
	/* PA_SC_VPORT_4_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_4_SCISSOR_BR: placeholder, patched at index 48 */
	0xffffff48,
	/* PA_SC_VPORT_5_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_5_SCISSOR_BR: placeholder, patched at index 50 */
	0xffffff50,
	/* PA_SC_VPORT_6_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_6_SCISSOR_BR: placeholder, patched at index 52 */
	0xffffff52,
	/* PA_SC_VPORT_7_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_7_SCISSOR_BR: placeholder, patched at index 54 */
	0xffffff54,
	/* PA_SC_VPORT_8_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_8_SCISSOR_BR: placeholder, patched at index 56 */
	0xffffff56,
	/* PA_SC_VPORT_9_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_9_SCISSOR_BR: placeholder, patched at index 58 */
	0xffffff58,
	/* PA_SC_VPORT_A_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_A_SCISSOR_BR: placeholder, patched at index 60 */
	0xffffff60,
	/* PA_SC_VPORT_B_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_B_SCISSOR_BR: placeholder, patched at index 62 */
	0xffffff62,
	/* PA_SC_VPORT_C_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_C_SCISSOR_BR: placeholder, patched at index 64 */
	0xffffff64,
	/* PA_SC_VPORT_D_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_D_SCISSOR_BR: placeholder, patched at index 66 */
	0xffffff66,
	/* PA_SC_VPORT_E_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_E_SCISSOR_BR: placeholder, patched at index 68 */
	0xffffff68,
	/* PA_SC_VPORT_F_SCISSOR_TL */
	PSVST_WND_OF_DIS,
	/* PA_SC_VPORT_F_SCISSOR_BR: placeholder, patched at index 70 */
	0xffffff70,
	/* PA_SC_VPORT_0_TE_ZMIN */
	0x00000000,
	/* PA_SC_VPORT_0_TE_ZMAX: placeholder, patched at index 72 */
	0xffffff72,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 3),
	CTX_REG_IDX(CP_RING_ID),
	/* CP_RING_ID */
	0x00000000,
	/* CP_VM_ID */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 5),
	CTX_REG_IDX(VGT_MAX_VTX_IDX),
	/* VGT_MAX_VTX_IDX */
	0xffffffff,
	/* VGT_MIN_VTX_IDX */
	0x00000000,
	/* VGT_IDX_OF */
	0x00000000,
	/* VGT_MULTI_PRIM_IB_RESET_IDX */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 5),
	CTX_REG_IDX(CB_BLEND_RED),
	/* CB_BLEND_RED */
	0x00000000,
	/* CB_BLEND_GREEN */
	0x00000000,
	/* CB_BLEND_BLUE */
	0x00000000,
	/* CB_BLEND_ALPHA */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 2),
	CTX_REG_IDX(CB_0_BLEND_CTL),
	/* CB_0_BLEND_CTL: disable blending for CB 0 */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 15),
	CTX_REG_IDX(DB_DEPTH_CTL),
	/* DB_DEPTH_CTL */
	0x00000000,
	/* DB_EQAA */
	0x00000000,
	/* CB_COLOR_CTL: placeholder, patched at index 96 */
	0xffffff96,
	/* DB_SHADER_CTL: placeholder, patched at index 97 */
	0xffffff97,
	/* PA_CL_CLIP_CTL */
	PCCC_CLIP_DIS,
	/* PA_SU_SC_MODE_CTL */
	PSSMC_FACE,
	/* PA_SC_VPORT_TE_CTL */
	PSVTC_VTX_XY_FMT,
	/* PA_CL_VS_OUT_CTL */
	0x00000000,
	/* PA_CL_NANINF_CTL */
	0x00000000,
	/* PA_SU_LINE_STIPPLE_CTL */
	0x00000000,
	/* PA_SU_LINE_STIPPLE_SCALE */
	0x00000000,
	/* PA_SU_PRIM_FILTER_CTL */
	0x00000000,
	/* SQ_LSTMP_RING_ITEM_SZ */
	0x00000000,
	/* SQ_HSTMP_RING_ITEM_SZ */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 18),
	CTX_REG_IDX(PA_SU_POINT_SZ),
	/* PA_SU_POINT_SZ */
	0x00000000,
	/* PA_SU_POINT_MINMAX */
	0x00000000,
	/* PA_SU_LINE_CTL: placeholder, patched at index 112 */
	0xfffff112,
	/* PA_SC_LINE_STIPPLE */
	0x00000000,
	/* VGT_OUTPUT_PATH_CTL */
	0x00000000,
	/* VGT_HOS_CTL */
	0x00000000,
	/* VGT_HOS_MAX_TESS_LVL */
	0x00000000,
	/* VGT_HOS_MIN_TESS_LVL */
	0x00000000,
	/* VGT_HOS_REUSE_DEPTH */
	0x00000000,
	/* VGT_GROUP_PRIM_TYPE */
	0x00000000,
	/* VGT_GROUP_FIRST_DECR */
	0x00000000,
	/* VGT_GROUP_DECR */
	0x00000000,
	/* VGT_GROUP_VECT_0_CTL */
	0x00000000,
	/* VGT_GROUP_VECT_1_CTL */
	0x00000000,
	/* VGT_GROUP_VECT_0_FMT_CTL */
	0x00000000,
	/* VGT_GROUP_VECT_1_FMT_CTL */
	0x00000000,
	/* VGT_GS_MODE */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 3),
	CTX_REG_IDX(PA_SC_MODE_CTL_0),
	/* PA_SC_MODE_CTL_0 */
	0x00000000,
	/* PA_SC_MODE_CTL_1 */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 2),
	CTX_REG_IDX(VGT_PRIM_ID_ENA),
	/* VGT_PRIM_ID_ENA */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 2),
	CTX_REG_IDX(VGT_MULTI_PRIM_IB_RESET_ENA),
	/* VGT_MULTI_PRIM_IB_RESET_ENA */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 3),
	CTX_REG_IDX(VGT_INST_STEP_RATE_0),
	/* VGT_INST_STEP_RATE_0 */
	0x00000000,
	/* VGT_INST_STEP_RATE_1 */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 3),
	CTX_REG_IDX(VGT_REUSE_OFF),
	/* VGT_REUSE_OFF */
	0x00000000,
	/* VGT_VTX_CNT_ENA */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 2),
	CTX_REG_IDX(VGT_SHADER_STAGES_ENA),
	/* VGT_SHADER_STAGES_ENA */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 2),
	CTX_REG_IDX(DB_ALPHA_TO_MASK),
	/* DB_ALPHA_TO_MASK: placeholder, patched at index 150 */
	0xfffff150,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 7),
	CTX_REG_IDX(PA_SU_POLY_OF_DB_FMT_CTL),
	/* PA_SU_POLY_OF_DB_FMT_CTL */
	0x00000000,
	/* PA_SU_POLY_OF_CLAMP */
	0x00000000,
	/* PA_SU_POLY_OF_FRONT_SCALE */
	0x00000000,
	/* PA_SU_POLY_OF_FRONT_OF */
	0x00000000,
	/* PA_SU_POLY_OF_BACK_SCALE */
	0x00000000,
	/* PA_SU_POLY_OF_BACK_OF */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 3),
	CTX_REG_IDX(VGT_STRMOUT_CFG),
	/* VGT_STRMOUT_CFG */
	0x00000000,
	/* VGT_STRMOUT_BUF_CFG */
	0x00000000,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 28),
	CTX_REG_IDX(PA_SC_CENTROID_PRIORITY_0),
	/* PA_SC_CENTROID_PRIORITY_0: placeholder, patched at index 165 */
	0xfffff165,
	/* PA_SC_CENTROID_PRIORITY_1: placeholder, patched at index 166 */
	0xfffff166,
	/* PA_SC_LINE_CTL */
	0x00000000,
	/* PA_SC_AA_CFG */
	0x00000000,
	/* PA_SU_VTX_CTL: placeholder, patched at index 169 */
	0xfffff169,
	/* PA_CL_GB_VERT_CLIP_ADJ: placeholder, patched at index 170 */
	0xfffff170,
	/* PA_CL_GB_VERT_DISC_ADJ: placeholder, patched at index 171 */
	0xfffff171,
	/* PA_CL_GB_HORZ_CLIP_ADJ: placeholder, patched at index 172 */
	0xfffff172,
	/* PA_CL_GB_HORZ_DISC_ADJ: placeholder, patched at index 173 */
	0xfffff173,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_1 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_2 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_3 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_1 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_2 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_3 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_1 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_2 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_3 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_1 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_2 */
	0x00000000,
	/* PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_3 */
	0x00000000,
	/* PA_SC_AA_MASK_X0Y0_X1Y0: placeholder, patched at index 190 */
	0xfffff190,
	/* PA_SC_AA_MASK_X0Y1_X1Y1: placeholder, patched at index 191 */
	0xfffff191,
/*----------------------------------------------------------------------------*/
	PKT3(PKT3_SET_CTX_REG, 3),
	CTX_REG_IDX(VGT_VTX_REUSE_BLK_CTL),
	/* VGT_VTX_REUSE_BLK_CTL: placeholder, patched at index 194 */
	0xfffff194,
	/* VGT_OUT_DEALLOC_CTL: placeholder, patched at index 195 */
	0xfffff195
};

/*
 * Patch the cpu-computed entries of ctx_clr_state in place. Each index
 * below matches a 0xffffffdd/0xfffff1dd placeholder in the table (the
 * placeholder's trailing decimal digits are the array index).
 */
static void ctx_clr_state_init(void)
{
	/* DB_RENDER_OVERRIDE_0: do not force HiZ/HiS */
	ctx_clr_state[5] = set(DRO_FORCE_HIZ_ENA, DRO_FORCE_DIS)
				| set(DRO_FORCE_HIS_ENA_0, DRO_FORCE_DIS)
				| set(DRO_FORCE_HIS_ENA_1, DRO_FORCE_DIS);
	/* PA_SC_CLIPRECT_RULE */
	ctx_clr_state[24] = set(PSCR_CLIP_RULE, 0xffff);
	/* PA_SC_CLIPRECT_[0-3]_BR */
	/* XXX: should probably be 16384 on SI */
	ctx_clr_state[26] = set(PSCB_X, 8192) | set(PSCB_Y, 8192);
	ctx_clr_state[28] = set(PSCB_X, 8192) | set(PSCB_Y, 8192);
	ctx_clr_state[30] = set(PSCB_X, 8192) | set(PSCB_Y, 8192);
	ctx_clr_state[32] = set(PSCB_X, 8192) | set(PSCB_Y, 8192);
	/* PA_SC_EDGERULE */
	ctx_clr_state[33] = set(PSE_ER_TRI, 0xa)| set(PSE_ER_POINT, 0xa)
		| set(PSE_ER_RECT, 0xa) | set(PSE_ER_LINE_LR, 0x8a)
		| set(PSE_ER_LINE_RL, 0xa2) | set(PSE_ER_LINE_TB, 0xa)
						| set(PSE_ER_LINE_BT, 0xa);
	/* CB_TGT_MASK: CB 0 will output all 4 color components to target */
	ctx_clr_state[35] = set(CTM_TGT_0_ENA, 0xf);
	/*
	 * CB_SHADER_MASK: CB 0 will use the 4 color components from
	 * the pixel/fragment shader
	 */
	ctx_clr_state[36] = set(CSM_OUTPUT_0_ENA, 0xf);
	/* PA_SC_VPORT_[0-F]_SCISSOR_BR */
	/* XXX: should probably be 16384 on SI */
	ctx_clr_state[40] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[42] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[44] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[46] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[48] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[50] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[52] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[54] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[56] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[58] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[60] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[62] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[64] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[66] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[68] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	ctx_clr_state[70] = set(PSVSB_X, 8192) | set(PSVSB_Y, 8192);
	/* PA_SC_VPORT_0_TE_ZMAX */
	ctx_clr_state[72] = f2u(1.0f);
	/* CB_COLOR_CTL */
	ctx_clr_state[96] = set(CCC_MODE, CCC_CB_NORMAL)
						| set(CCC_ROP3, CCC_0XCC);
	/* DB_SHADER_CTL */
	ctx_clr_state[97] = set(DSC_Z_ORDER, DSC_EARLY_Z_THEN_LATE_Z)
							| DSC_EXEC_ON_HIER_FAIL;
	/* PA_SU_LINE_CTL */
	ctx_clr_state[112] = set(PSLC_W, 8);
	/* DB_ALPHA_TO_MASK */
	ctx_clr_state[150] = set(DATM_ALPHA_TO_MASK_OF_0, 2)
					| set(DATM_ALPHA_TO_MASK_OF_1, 2)
					| set(DATM_ALPHA_TO_MASK_OF_2, 2)
					| set(DATM_ALPHA_TO_MASK_OF_3, 2);
	/* PA_SC_CENTROID_PRIORITY_0/1 */
	ctx_clr_state[165] = set(PSCP_DISTANCE_0, 0) | set(PSCP_DISTANCE_1, 1)
			| set(PSCP_DISTANCE_2, 2) | set(PSCP_DISTANCE_3, 3)
			| set(PSCP_DISTANCE_4, 4) | set(PSCP_DISTANCE_5, 5)
			| set(PSCP_DISTANCE_6, 6) | set(PSCP_DISTANCE_7, 7);
	ctx_clr_state[166] = set(PSCP_DISTANCE_8, 8) | set(PSCP_DISTANCE_9, 9)
			| set(PSCP_DISTANCE_A, 0xa) | set(PSCP_DISTANCE_B, 0xb)
			| set(PSCP_DISTANCE_C, 0xc) | set(PSCP_DISTANCE_D, 0xd)
			| set(PSCP_DISTANCE_E, 0xe) | set(PSCP_DISTANCE_F, 0xf);
	/* PA_SU_VTX_CTL */
	ctx_clr_state[169] = PSVC_PIX_CENTER
				| set(PSVC_ROUND_MODE, PSVC_ROUND_TO_EVEN);
	/* PA_CL_GB_...: disable GB (Guard Band) = setting 1.0f value */
	ctx_clr_state[170] = f2u(1.0f);
	ctx_clr_state[171] = f2u(1.0f);
	ctx_clr_state[172] = f2u(1.0f);
	ctx_clr_state[173] = f2u(1.0f);
	/* PA_SC_AA_MASK_...: all samples enabled */
	ctx_clr_state[190] = set(PSAMXX_AA_MASK_X0Y0, 0xffff)
					| set(PSAMXX_AA_MASK_X1Y0, 0xffff);
	ctx_clr_state[191] = set(PSAMXX_AA_MASK_X0Y1, 0xffff)
					| set(PSAMXX_AA_MASK_X1Y1, 0xffff);
	/* VGT_VTX_REUSE_BLK_CTL */
	ctx_clr_state[194] = set(VVRBC_VTX_REUSE_DEPTH, 0xe);
	/* VGT_OUT_DEALLOC_CTL */
	ctx_clr_state[195] = set(VODC_DEALLOC_DIST, 0x10);
}

/*
 * Emit the clear-state preamble on the gfx ring and a CLR_STATE packet on
 * every ring (gfx + both compute rings). The packet order is dictated by
 * the hardware command stream format; do not reorder.
 */
void cps_ctx_clr(struct pci_dev *dev)
{
	u32 i;

	/* patch the cpu computed values into the table first */
	ctx_clr_state_init();

	/* bracket the table with PREAMBLE begin/end markers on the gfx ring */
	gpu_3d_ring_wr(dev, PKT3(PKT3_PREAMBLE_CTL, 1));
	gpu_3d_ring_wr(dev, PKT3_PREAMBLE_BEGIN_CLR_STATE);
	for (i = 0; i < ARRAY_SIZE(ctx_clr_state); ++i)
		gpu_3d_ring_wr(dev, ctx_clr_state[i]);
	gpu_3d_ring_wr(dev, PKT3(PKT3_PREAMBLE_CTL, 1));
	gpu_3d_ring_wr(dev, PKT3_PREAMBLE_END_CLR_STATE);
	gpu_3d_ring_wr(dev, PKT3(PKT3_CLR_STATE, 1));
	gpu_3d_ring_wr(dev, 0);
	gpu_3d_ring_commit(dev);

	/* compute state */
	gpu_3d_ring_wr(dev, PKT3_COMPUTE(PKT3_CLR_STATE, 1));
	gpu_3d_ring_wr(dev, 0);
	gpu_3d_ring_commit(dev);

	gpu_c0_ring_wr(dev, PKT3_COMPUTE(PKT3_CLR_STATE, 1));
	gpu_c0_ring_wr(dev, 0);
	gpu_c0_ring_commit(dev);

	gpu_c1_ring_wr(dev, PKT3_COMPUTE(PKT3_CLR_STATE, 1));
	gpu_c1_ring_wr(dev, 0);
	gpu_c1_ring_commit(dev);
}

/*
 * Enable the time stamp interrupt on the 3 cp rings, without touching
 * the cp internal interrupts (CNTX_BUSY and CNTX_EMPTY) on ring 0.
 */
void cps_intr_ena(struct pci_dev *dev)
{
	u32 ctl;

	/* keep only the cp internal interrupt bits, add the time stamp one */
	ctl = rr32(dev, CP_INT_CTL_RING_0)
		& (CICR_CNTX_BUSY_INT_ENA | CICR_CNTX_EMPTY_INT_ENA);
	wr32(dev, ctl | CICR_TIME_STAMP_INT_ENA, CP_INT_CTL_RING_0);

	wr32(dev, CICR_TIME_STAMP_INT_ENA, CP_INT_CTL_RING_1);
	wr32(dev, CICR_TIME_STAMP_INT_ENA, CP_INT_CTL_RING_2);
}

/* works even if ucode is not loaded */
void cps_intr_reset(struct pci_dev *dev)
{
	u32 cp_int_ctl_ring_0;
	/*
	 * do not touch the state of cp internal interrupts which are
	 * CNTX_BUSY and CNTX_EMPTY. It's done in gpu_3d_intr_idle_ena and
	 * gpu_3d_intr_idle_dis
	 */
	cp_int_ctl_ring_0 = rr32(dev, CP_INT_CTL_RING_0);
	cp_int_ctl_ring_0 &= CICR_CNTX_BUSY_INT_ENA | CICR_CNTX_EMPTY_INT_ENA;
	wr32(dev, cp_int_ctl_ring_0, CP_INT_CTL_RING_0);

	wr32(dev, 0, CP_INT_CTL_RING_1);
	wr32(dev, 0, CP_INT_CTL_RING_2);
}

/* Fetch the pfp ucode blob (PFP_FW_DWS dws) into dd->pfp_fw. */
static long pfp_ucode_load(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return ucode_load(dev, &dd->pfp_fw, "pfp", PFP_FW_DWS);
}

/*
 * Stream the loaded pfp ucode into the pfp ucode ram. The blob is stored
 * big endian; the address register is reset before and after the upload,
 * same pattern as ce_ucode_program/me_ucode_program.
 */
static void pfp_ucode_program(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	const __be32 *fw_data;
	u32 i;

	dd = pci_get_drvdata(dev);

	fw_data = (const __be32 *)dd->pfp_fw->data;
	/* fixed: missing space after comma, now consistent with siblings */
	wr32(dev, 0, CP_PFP_UCODE_ADDR);
	for (i = 0; i < PFP_FW_DWS; ++i)
		wr32(dev, be32_to_cpup(fw_data++), CP_PFP_UCODE_DATA);
	wr32(dev, 0, CP_PFP_UCODE_ADDR);
}

/* Fetch the ce ucode blob (CE_FW_DWS dws) into dd->ce_fw. */
static long ce_ucode_load(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return ucode_load(dev, &dd->ce_fw, "ce", CE_FW_DWS);
}

/*
 * Stream the loaded ce ucode (big endian dws) into the ce ucode ram,
 * resetting the address register before and after the upload.
 */
static void ce_ucode_program(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	const __be32 *words;
	u32 w;

	words = (const __be32 *)dd->ce_fw->data;
	wr32(dev, 0, CP_CE_UCODE_ADDR);
	for (w = 0; w < CE_FW_DWS; ++w)
		wr32(dev, be32_to_cpup(&words[w]), CP_CE_UCODE_DATA);
	wr32(dev, 0, CP_CE_UCODE_ADDR);
}

/* Fetch the me ucode blob (ME_FW_DWS dws) into dd->me_fw. */
static long me_ucode_load(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return ucode_load(dev, &dd->me_fw, "me", ME_FW_DWS);
}

/*
 * Stream the loaded me ucode (big endian dws) into the me engine ram,
 * resetting the write address register before and after the upload.
 */
static void me_ucode_program(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	const __be32 *words;
	u32 w;

	words = (const __be32 *)dd->me_fw->data;
	wr32(dev, 0, CP_ME_RAM_WADDR);
	for (w = 0; w < ME_FW_DWS; ++w)
		wr32(dev, be32_to_cpup(&words[w]), CP_ME_RAM_DATA);
	wr32(dev, 0, CP_ME_RAM_WADDR);
}

/*
 * Load the ucode blobs of the 3 cp engines (pfp, ce, me). On failure,
 * release the blobs already acquired and return -SI_ERR; 0 on success.
 */
long cps_engines_ucode_load(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	if (pfp_ucode_load(dev) == -SI_ERR)
		return -SI_ERR;

	if (ce_ucode_load(dev) == -SI_ERR)
		goto unroll_pfp;

	if (me_ucode_load(dev) == -SI_ERR)
		goto unroll_ce;
	return 0;

unroll_ce:
	release_firmware(dd->ce_fw);

unroll_pfp:
	release_firmware(dd->pfp_fw);
	return -SI_ERR;
}

/*
 * Program the ucode of the 3 cp engines, then reset every engine
 * ram address register. (Also drops trailing whitespace present in the
 * original.)
 */
void cps_engines_ucode_program(struct pci_dev *dev)
{
	pfp_ucode_program(dev);
	ce_ucode_program(dev);
	me_ucode_program(dev);

	wr32(dev, 0, CP_PFP_UCODE_ADDR);
	wr32(dev, 0, CP_CE_UCODE_ADDR);
	wr32(dev, 0, CP_ME_RAM_WADDR);
	wr32(dev, 0, CP_ME_RAM_RADDR);
}

/*
 * Halt the 3 cp engines (ce, me, pfp), disable scratch register write
 * back, and give the hardware a short settle delay.
 */
void cps_engines_stop(struct pci_dev *dev)
{
	wr32(dev, CMC_CP_CE_HALT | CMC_CP_ME_HALT | CMC_CP_PFP_HALT, CP_ME_CTL);
	wr32(dev, 0, SCRATCH_UMSK);
	udelay(50);
}

/* Release all engine halt bits in CP_ME_CTL, then let the cp settle. */
void cps_enable(struct pci_dev *dev)
{
	wr32(dev, 0, CP_ME_CTL);
	udelay(50);
}

/* should be at the very beginning of the ring */
void cps_me_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	dd = pci_get_drvdata(dev);

	/* common to all cps, done by gfx */
	gpu_3d_ring_wr(dev, PKT3(PKT3_ME_INIT, 6));
	gpu_3d_ring_wr(dev, 0x1);
	gpu_3d_ring_wr(dev, 0x0);
	gpu_3d_ring_wr(dev, dd->cfg.gpu.hw_ctxs_n - 1);
	gpu_3d_ring_wr(dev, set(PKT3_ME_INIT_DEV_ID, 1));
	gpu_3d_ring_wr(dev, 0);
	gpu_3d_ring_wr(dev, 0);

	/* specific for the ce engine */
	gpu_3d_ring_wr(dev, PKT3(PKT3_SET_BASE, 3));
	gpu_3d_ring_wr(dev, set(PKT3_BASE_IDX, PKT3_CE_PARTITION_BASE));
	gpu_3d_ring_wr(dev, 0xc000);
	gpu_3d_ring_wr(dev, 0xe000);

	gpu_3d_ring_commit(dev);
}

/*
 * o ring size is 2^CP_RING_LOG2_QWS(17) quadwords (256 * 4096 bytes)
 * o block size is gpu page size, namely 2^GPU_PAGE_LOG2_QWS(9) quadwords
 *   (4096 bytes)
 */

/* ring 0 GFX */
/* ring 0 GFX: program the hardware ring buffer registers for the gfx cp */
static void gpu_3d_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 cp_rb_0_ctl;
	u32 cp_rb_0_rptr_addr;

	/* set ring buffer size */
	cp_rb_0_ctl = set(CRC_RB_BLK_LOG2_QWS, GPU_PAGE_LOG2_QWS)
				| set(CRC_RB_BUF_LOG2_QWS, CP_RING_LOG2_QWS);
	wr32(dev, cp_rb_0_ctl, CP_RB_0_CTL);

	/* initialize the ring buffer's read and write pointers */
	wr32(dev, cp_rb_0_ctl | CRC_RB_RPTR_WR_ENA, CP_RB_0_CTL);
	wr32(dev, 0, CP_RB_0_WPTR);
	wr32(dev, 0, CP_RB_0_RPTR);

	dd = pci_get_drvdata(dev);

	/* set the wb address, 2 lower bits are for endianness */
	cp_rb_0_rptr_addr = dd->ba.wb_map->gpu_addr + WB_GPU_3D_RPTR_OF;

	/* must be dw aligned */
	wr32(dev, lower_32_bits(cp_rb_0_rptr_addr), CP_RB_0_RPTR_ADDR_LO);
	wr32(dev, upper_32_bits(cp_rb_0_rptr_addr), CP_RB_0_RPTR_ADDR_HI);

	/* the fence reads the rptr back from the write-back page */
	fence_init(&dd->gpu_3d.fence, dd->ba.wb_map->cpu_addr
							+ WB_GPU_3D_RPTR_OF);

	wr32(dev, 0xff, SCRATCH_UMSK); /* specific to gfx? Not global? */
	mdelay(1);
	/* drop CRC_RB_RPTR_WR_ENA, back to normal operation */
	wr32(dev, cp_rb_0_ctl, CP_RB_0_CTL);

	/* 256 bytes aligned ok because it is GPU_PAGE_SZ aligned */
	wr32(dev, dd->ba.gpu_3d_ring_map->gpu_addr >> 8, CP_RB_0_BASE);

	/* reset the cached (software) write pointer */
	dd->gpu_3d.wptr = 0;
}

/* ring 1 compute only */
/* ring 1 compute only: program the hardware ring buffer registers */
static void gpu_c0_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 cp_rb_1_ctl;
	u32 cp_rb_1_rptr_addr;

	/* set ring buffer size */
	cp_rb_1_ctl = set(CRC_RB_BLK_LOG2_QWS, GPU_PAGE_LOG2_QWS)
				| set(CRC_RB_BUF_LOG2_QWS, CP_RING_LOG2_QWS);
	wr32(dev, cp_rb_1_ctl, CP_RB_1_CTL);

	/* initialize the ring buffer's read and write pointers */
	wr32(dev, cp_rb_1_ctl | CRC_RB_RPTR_WR_ENA, CP_RB_1_CTL);
	wr32(dev, 0, CP_RB_1_WPTR);
	wr32(dev, 0, CP_RB_1_RPTR);

	dd = pci_get_drvdata(dev);

	/* set the wb address, 2 lower bits are for endianness */
	cp_rb_1_rptr_addr = dd->ba.wb_map->gpu_addr + WB_GPU_C_0_RPTR_OF;

	/* must be dword aligned */
	wr32(dev, lower_32_bits(cp_rb_1_rptr_addr), CP_RB_1_RPTR_ADDR_LO);
	wr32(dev, upper_32_bits(cp_rb_1_rptr_addr), CP_RB_1_RPTR_ADDR_HI);

	/* the fence reads the rptr back from the write-back page */
	fence_init(&dd->gpu_c0.fence, dd->ba.wb_map->cpu_addr
							+ WB_GPU_C_0_RPTR_OF);

	mdelay(1);
	/* drop CRC_RB_RPTR_WR_ENA, back to normal operation */
	wr32(dev, cp_rb_1_ctl, CP_RB_1_CTL);

	/* 256 bytes aligned ok because it is GPU_PAGE_SZ aligned */
	wr32(dev, dd->ba.gpu_c0_ring_map->gpu_addr >> 8, CP_RB_1_BASE);

	/* reset the cached (software) write pointer */
	dd->gpu_c0.wptr = 0;
}

/* ring 2 compute only */
/* ring 2 compute only: program the hardware ring buffer registers */
static void gpu_c1_init(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 cp_rb_2_ctl;
	u32 cp_rb_2_rptr_addr;

	/* set ring buffer size */
	cp_rb_2_ctl = set(CRC_RB_BLK_LOG2_QWS, GPU_PAGE_LOG2_QWS)
				| set(CRC_RB_BUF_LOG2_QWS, CP_RING_LOG2_QWS);
	wr32(dev, cp_rb_2_ctl, CP_RB_2_CTL);

	/* initialize the ring buffer's read and write pointers */
	wr32(dev, cp_rb_2_ctl | CRC_RB_RPTR_WR_ENA, CP_RB_2_CTL);
	wr32(dev, 0, CP_RB_2_WPTR);
	wr32(dev, 0, CP_RB_2_RPTR);

	dd = pci_get_drvdata(dev);

	/* set the wb address, 2 lower bits are for endianness */
	cp_rb_2_rptr_addr = dd->ba.wb_map->gpu_addr + WB_GPU_C_1_RPTR_OF;

	/* must be dw aligned */
	wr32(dev, lower_32_bits(cp_rb_2_rptr_addr), CP_RB_2_RPTR_ADDR_LO);
	wr32(dev, upper_32_bits(cp_rb_2_rptr_addr), CP_RB_2_RPTR_ADDR_HI);

	/* the fence reads the rptr back from the write-back page */
	fence_init(&dd->gpu_c1.fence, dd->ba.wb_map->cpu_addr
							+ WB_GPU_C_1_RPTR_OF);

	mdelay(1);
	/* drop CRC_RB_RPTR_WR_ENA, back to normal operation */
	wr32(dev, cp_rb_2_ctl, CP_RB_2_CTL);

	/*
	 * 256 bytes aligned ok because it is GPU_PAGE_SZ aligned.
	 * BUGFIX: was gpu_c0_ring_map (copy-paste from gpu_c0_init) — ring 2
	 * must point at the c1 ring buffer, consistent with gpu_c1_ring_wr.
	 */
	wr32(dev, dd->ba.gpu_c1_ring_map->gpu_addr >> 8, CP_RB_2_BASE);

	/* reset the cached (software) write pointer */
	dd->gpu_c1.wptr = 0;
}

static void cps_reset(struct pci_dev *dev)
{
	/* reset cp; if cp is reset, then pa, sh, vgt also need to be reset */
	wr32(dev, GSR_SOFT_RESET_CP | GSR_SOFT_RESET_PA | GSR_SOFT_RESET_SPI
		| GSR_SOFT_RESET_VGT | GSR_SOFT_RESET_SX, GRBM_SOFT_RESET);
	rr32(dev, GRBM_SOFT_RESET);	/* read back to post the write */
	mdelay(15);			/* let the reset take effect */
	wr32(dev, 0, GRBM_SOFT_RESET);	/* release the reset */
	rr32(dev, GRBM_SOFT_RESET);	/* read back to post the write */
}

/*----------------------------------------------------------------------------*/
/* gfx ring read pointer (in dws), fetched from the write-back page */
static u32 gpu_3d_rptr_dw_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return le32_to_cpup(dd->ba.wb_map->cpu_addr + WB_GPU_3D_RPTR_OF);
}

/* gfx ring cached (software) write pointer, in dws */
static u32 gpu_3d_wptr_dw_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return dd->gpu_3d.wptr;
}

/* One-time software-side setup for the gfx ring (lock, fence, ring ops). */
static void gpu_3d_init_once(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	spin_lock_init(&dd->gpu_3d.lock);
	fence_init_once(&dd->gpu_3d.fence);

	/* ring descriptor: geometry and pointer accessors */
	dd->gpu_3d.ring.dev = dev;
	dd->gpu_3d.ring.ring_dws_n = 1 << CP_RING_LOG2_DWS;
	dd->gpu_3d.ring.pf_dw_mask = CP_RING_PFP_DW_MASK;
	dd->gpu_3d.ring.rptr_dw_get = gpu_3d_rptr_dw_get;
	dd->gpu_3d.ring.wptr_dw_get = gpu_3d_wptr_dw_get;
}
/*----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/* compute ring 0 read pointer (in dws), fetched from the write-back page */
static u32 gpu_c0_rptr_dw_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return le32_to_cpup(dd->ba.wb_map->cpu_addr + WB_GPU_C_0_RPTR_OF);
}

/* compute ring 0 cached (software) write pointer, in dws */
static u32 gpu_c0_wptr_dw_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return dd->gpu_c0.wptr;
}

/* One-time software-side setup for compute ring 0 (lock, fence, ring ops). */
static void gpu_c0_init_once(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	spin_lock_init(&dd->gpu_c0.lock);
	fence_init_once(&dd->gpu_c0.fence);

	/* ring descriptor: geometry and pointer accessors */
	dd->gpu_c0.ring.dev = dev;
	dd->gpu_c0.ring.ring_dws_n = 1 << CP_RING_LOG2_DWS;
	dd->gpu_c0.ring.pf_dw_mask = CP_RING_PFP_DW_MASK;
	dd->gpu_c0.ring.rptr_dw_get = gpu_c0_rptr_dw_get;
	dd->gpu_c0.ring.wptr_dw_get = gpu_c0_wptr_dw_get;
}
/*----------------------------------------------------------------------------*/

/*----------------------------------------------------------------------------*/
/* compute ring 1 read pointer (in dws), fetched from the write-back page */
static u32 gpu_c1_rptr_dw_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return le32_to_cpup(dd->ba.wb_map->cpu_addr + WB_GPU_C_1_RPTR_OF);
}

/* compute ring 1 cached (software) write pointer, in dws */
static u32 gpu_c1_wptr_dw_get(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	return dd->gpu_c1.wptr;
}

/* One-time software-side setup for compute ring 1 (lock, fence, ring ops). */
static void gpu_c1_init_once(struct pci_dev *dev)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);

	spin_lock_init(&dd->gpu_c1.lock);
	fence_init_once(&dd->gpu_c1.fence);

	/* ring descriptor: geometry and pointer accessors */
	dd->gpu_c1.ring.dev = dev;
	dd->gpu_c1.ring.ring_dws_n = 1 << CP_RING_LOG2_DWS;
	dd->gpu_c1.ring.pf_dw_mask = CP_RING_PFP_DW_MASK;
	dd->gpu_c1.ring.rptr_dw_get = gpu_c1_rptr_dw_get;
	dd->gpu_c1.ring.wptr_dw_get = gpu_c1_wptr_dw_get;
}
/*----------------------------------------------------------------------------*/

/*
 * One-time software-side setup of all 3 cp rings.
 * (Removed the local dev_drv_data pointer: it was fetched but never used.)
 */
void cps_init_once(struct pci_dev *dev)
{
	gpu_3d_init_once(dev);
	gpu_c0_init_once(dev);
	gpu_c1_init_once(dev);
}

/*
 * Hardware-side init of the command processors: queue thresholds, soft
 * reset, timers, scratch write-back address, then per-ring register setup.
 */
void cps_init(struct pci_dev *dev)
{
	u64 wb_scratch_addr;
	struct dev_drv_data *dd;

	/* XXX: may have to be set before ucode loading */
	wr32(dev, set(CQT_ROQ_IB_0_START, 0x16) | set(CQT_ROQ_IB_1_START, 0x2b),
							CP_QUEUE_THRESHOLDS);
	wr32(dev, set(CMT_MEQ_0_START, 0x30) | set(CMT_MEQ_1_START, 0x60),
							CP_MEQ_THRESHOLDS);
	wr32(dev, 0, CP_PERFMON_CTL);

	cps_reset(dev);

	/* disable the semaphore timers */
	wr32(dev, 0, CP_SEM_WAIT_TIMER);
	wr32(dev, 0, CP_SEM_INCOMPLETE_TIMER_CTL);

	wr32(dev, 0, CP_RBS_WPTR_DELAY);

	wr32(dev, 0, CP_DEBUG);

	dd = pci_get_drvdata(dev);

	wb_scratch_addr = dd->ba.wb_map->gpu_addr + WB_SCRATCH_OF;
	/*
	 * 256 bytes block index is ok because gpu address and chosen write back
	 * page offset fit properly that required alignment
	 */
	wr32(dev, (wb_scratch_addr >> 8) & 0xffffffff, SCRATCH_ADDR);

	gpu_3d_init(dev);
	gpu_c0_init(dev);
	gpu_c1_init(dev);
}

/* Disable cp memory light sleep; write only if the bit actually changes. */
void cps_ls_dis(struct pci_dev *dev)
{
	u32 cur;
	u32 want;

	cur = rr32(dev, CP_MEM_SLP_CTL);
	want = cur & ~CMSC_CP_MEM_LS_ENA;
	if (want != cur)
		wr32(dev, want, CP_MEM_SLP_CTL);
}

/* Enable cp memory light sleep; write only if the bit is not already set. */
void cps_ls_ena(struct pci_dev *dev)
{
	u32 cp_mem_slp_ctl;

	cp_mem_slp_ctl = rr32(dev, CP_MEM_SLP_CTL);
	if (!(cp_mem_slp_ctl & CMSC_CP_MEM_LS_ENA))
		wr32(dev, cp_mem_slp_ctl | CMSC_CP_MEM_LS_ENA,
							CP_MEM_SLP_CTL);
}

/* Append one little-endian dw to the gfx ring, wrapping modulo ring size. */
void gpu_3d_ring_wr(struct pci_dev *dev, u32 v)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u32 __iomem *ring = dd->ba.gpu_3d_ring_map->cpu_addr;
	u32 w = dd->gpu_3d.wptr;

	ring[w] = cpu_to_le32(v);
	dd->gpu_3d.wptr = (w + 1) & CP_RING_DW_MASK;
}

/*
 * Publish pending gfx ring dws to the hardware: pad with PKT2 nops up to
 * the fetch alignment, fence the memory writes, then ring the doorbell
 * (CP_RB_0_WPTR). The trailing read posts the wptr write.
 */
void gpu_3d_ring_commit(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 __iomem *r;

	dd = pci_get_drvdata(dev);

	/* match ring fetch alignment */
	r = dd->ba.gpu_3d_ring_map->cpu_addr;
	while (dd->gpu_3d.wptr & CP_RING_PFP_DW_MASK)
		r[dd->gpu_3d.wptr++] = cpu_to_le32(PKT2);

	wmb();	/* data write operations emitted before dma */

	dd->gpu_3d.wptr &= CP_RING_DW_MASK;
	wr32(dev, dd->gpu_3d.wptr, CP_RB_0_WPTR);
	rr32(dev, CP_RB_0_WPTR);
}

/* the 2 following functions deal with cp ring 0 internal interrupts */
void gpu_3d_ring_intr_idle_ena(struct pci_dev *dev)
{
	u32 cp_int_ctl_ring_0;

	cp_int_ctl_ring_0 = rr32(dev, CP_INT_CTL_RING_0);

	cp_int_ctl_ring_0 |= (CICR_CNTX_BUSY_INT_ENA | CICR_CNTX_EMPTY_INT_ENA);

	wr32(dev, cp_int_ctl_ring_0, CP_INT_CTL_RING_0);
}

/*
 * cp ring 0 internal interrupts: disable CNTX_BUSY/CNTX_EMPTY, flush with
 * a gfx (ctx) register read, then wait on the rlc.
 * (Removed the set-but-never-read db_depth_info local: the MMIO read is
 * the intended side effect, its value is irrelevant.)
 */
void gpu_3d_ring_intr_idle_dis(struct pci_dev *dev)
{
	u32 cp_int_ctl_ring_0;

	cp_int_ctl_ring_0 = rr32(dev, CP_INT_CTL_RING_0);

	cp_int_ctl_ring_0 &= ~(CICR_CNTX_BUSY_INT_ENA
						| CICR_CNTX_EMPTY_INT_ENA);

	wr32(dev, cp_int_ctl_ring_0, CP_INT_CTL_RING_0);

	/* read a gfx register (ctx); the value itself is discarded */
	rr32(dev, DB_DEPTH_INFO);

	rlc_wait(dev);
}

/* Append one little-endian dw to compute ring 0, wrapping modulo size. */
void gpu_c0_ring_wr(struct pci_dev *dev, u32 v)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u32 __iomem *ring = dd->ba.gpu_c0_ring_map->cpu_addr;
	u32 w = dd->gpu_c0.wptr;

	ring[w] = cpu_to_le32(v);
	dd->gpu_c0.wptr = (w + 1) & CP_RING_DW_MASK;
}

/*
 * Publish pending compute ring 0 dws: pad with PKT2 nops up to the fetch
 * alignment, fence the memory writes, then ring the doorbell
 * (CP_RB_1_WPTR). The trailing read posts the wptr write.
 */
void gpu_c0_ring_commit(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 __iomem *r;

	dd = pci_get_drvdata(dev);

	/* match ring fetch alignment */
	r = dd->ba.gpu_c0_ring_map->cpu_addr;
	while (dd->gpu_c0.wptr & CP_RING_PFP_DW_MASK)
		r[dd->gpu_c0.wptr++] = cpu_to_le32(PKT2);

	wmb();	/* data write operations emitted before dma */

	dd->gpu_c0.wptr &= CP_RING_DW_MASK;
	wr32(dev, dd->gpu_c0.wptr, CP_RB_1_WPTR);
	rr32(dev, CP_RB_1_WPTR);
}

/* Append one little-endian dw to compute ring 1, wrapping modulo size. */
void gpu_c1_ring_wr(struct pci_dev *dev, u32 v)
{
	struct dev_drv_data *dd = pci_get_drvdata(dev);
	u32 __iomem *ring = dd->ba.gpu_c1_ring_map->cpu_addr;
	u32 w = dd->gpu_c1.wptr;

	ring[w] = cpu_to_le32(v);
	dd->gpu_c1.wptr = (w + 1) & CP_RING_DW_MASK;
}

/*
 * Publish pending compute ring 1 dws: pad with PKT2 nops up to the fetch
 * alignment, fence the memory writes, then ring the doorbell
 * (CP_RB_2_WPTR). The trailing read posts the wptr write.
 */
void gpu_c1_ring_commit(struct pci_dev *dev)
{
	struct dev_drv_data *dd;
	u32 __iomem *r;

	dd = pci_get_drvdata(dev);

	/* match ring fetch alignment */
	r = dd->ba.gpu_c1_ring_map->cpu_addr;
	while (dd->gpu_c1.wptr & CP_RING_PFP_DW_MASK)
		r[dd->gpu_c1.wptr++] = cpu_to_le32(PKT2);

	wmb();	/* data write operations emitted before dma */

	dd->gpu_c1.wptr &= CP_RING_DW_MASK;
	wr32(dev, dd->gpu_c1.wptr, CP_RB_2_WPTR);
	rr32(dev, CP_RB_2_WPTR);
}
